-
Notifications
You must be signed in to change notification settings - Fork 30
/
RNASeq.py
5832 lines (5346 loc) · 312 KB
/
RNASeq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
###RNASeq
#Copyright 2005-2008 J. David Gladstone Institutes, San Francisco California
#Author Nathan Salomonis - nsalomonis@gmail.com
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is furnished
#to do so, subject to the following conditions:
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
import sys, string, os
from stats_scripts import statistics
import math
import os.path
import unique
import update
import copy
import time
import export
from build_scripts import EnsemblImport; reload(EnsemblImport)
try: from build_scripts import JunctionArrayEnsemblRules
except Exception: pass ### occurs with circular imports
try: from build_scripts import JunctionArray; reload(JunctionArray)
except Exception: pass ### occurs with circular imports
try: from build_scripts import ExonArrayEnsemblRules
except Exception: pass ### occurs with circular imports
import multiprocessing
import logging
import traceback
import warnings
import bisect
import shutil
from visualization_scripts import clustering; reload(clustering)
try:
import scipy
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as dist
except Exception: pass
try: import numpy
except Exception: pass
LegacyMode = True
try:
from scipy import average as Average
from scipy import stats
except Exception:
try: from statistics import avg as Average
except Exception: pass ### occurs with circular imports
def filepath(filename):
fn = unique.filepath(filename)
return fn
def read_directory(sub_dir):
dir_list_clean=[]
dir_list = unique.read_directory(sub_dir)
for filepath in dir_list:
if 'log.txt' not in filepath and '.log' not in filepath:
dir_list_clean.append(filepath)
return dir_list_clean
def makeUnique(item):
db1={}; list1=[]; k=0
for i in item:
try: db1[i]=[]
except TypeError: db1[tuple(i)]=[]; k=1
for i in db1:
if k==0: list1.append(i)
else: list1.append(list(i))
list1.sort()
return list1
def cleanUpLine(line):
line = string.replace(line,'\n','')
line = string.replace(line,'\c','')
data = string.replace(line,'\r','')
data = string.replace(data,'"','')
return data
######### Below code deals with building the AltDatabase #########
def collapseNoveExonBoundaries(novel_exon_coordinates,dataset_dir):
""" Merge exon predictions based on junction measurments from TopHat. The predicted exons are
bound by the identified splice site and the consensus length of reads in that sample"""
dataset_dir = string.replace(dataset_dir,'exp.','ExpressionInput/novel.')
export_data,status = AppendOrWrite(dataset_dir) ### Export all novel exons
if status == 'not found':
export_data.write('GeneID\tStrand\tExonID\tCoordinates\n')
novel_gene_exon_db={}
for (chr,coord) in novel_exon_coordinates:
key = (chr,coord)
ji,side,coord2 = novel_exon_coordinates[(chr,coord)]
try:
if side == 'left': ### left corresponds to the position of coord
intron = string.split(string.split(ji.ExonRegionID(),'-')[1][:2],'.')[0]
else:
intron = string.split(string.split(ji.ExonRegionID(),'-'),'.')[0]
ls = [coord,coord2]
ls.sort() ### The order of this is variable
if ji.Strand() == '-':
coord2,coord = ls
else: coord,coord2 = ls
if 'I' in intron and ji.Novel() == 'side':
#if 'ENSG00000221983' == ji.GeneID():
try: novel_gene_exon_db[ji.GeneID(),ji.Strand(),intron].append((coord,coord2,ji,key,side))
except Exception: novel_gene_exon_db[ji.GeneID(),ji.Strand(),intron] = [(coord,coord2,ji,key,side)]
except Exception: pass
outdatedExons={} ### merging novel exons, delete one of the two original
for key in novel_gene_exon_db:
firstNovel=True ### First putative novel exon coordinates examined for that gene
novel_gene_exon_db[key].sort()
if key[1]=='-':
novel_gene_exon_db[key].reverse()
for (c1,c2,ji,k,s) in novel_gene_exon_db[key]:
if firstNovel==False:
#print [c1,l2] #abs(c1-l2);sys.exit()
### see if the difference between the start position of the second exon is less than 300 nt away from the end of the last
if abs(c2-l1) < 300 and os!=s: ### 80% of human exons are less than 200nt - PMID: 15217358
proceed = True
#if key[1]=='-':
if c2 in k:
novel_exon_coordinates[k] = ji,s,l1
outdatedExons[ok]=None ### merged out entry
elif l1 in ok:
novel_exon_coordinates[ok] = li,os,c2
outdatedExons[k]=None ### merged out entry
else:
proceed = False ### Hence, the two splice-site ends are pointing to two distinct versus one common exons
"""
if c2 == 18683670 or l1 == 18683670:
print key,abs(c2-l1), c1, c2, l1, l2, li.ExonRegionID(), ji.ExonRegionID();
print k,novel_exon_coordinates[k]
print ok,novel_exon_coordinates[ok]
"""
if proceed:
values = string.join([ji.GeneID(),ji.Strand(),key[2],ji.Chr()+':'+str(l1)+'-'+str(c2)],'\t')+'\n'
export_data.write(values)
### For negative strand genes, c1 is larger than c2 but is the 5' begining of the exon
l1,l2,li,ok,os = c1,c2,ji,k,s ### record the last entry
firstNovel=False
for key in outdatedExons: ### Delete the non-merged entry
del novel_exon_coordinates[key]
export_data.close()
return novel_exon_coordinates
def exportNovelExonToBedCoordinates(species,novel_exon_coordinates,chr_status,searchChr=None):
### Export the novel exon coordinates based on those in the junction BED file to examine the differential expression of the predicted novel exon
#bamToBed -i accepted_hits.bam -split| coverageBed -a stdin -b /home/databases/hESC_differentiation_exons.bed > day20_7B__exons-novel.bed
bed_export_path = filepath('AltDatabase/'+species+'/RNASeq/chr/'+species + '_Ensembl_exons'+searchChr+'.bed')
bed_data = open(bed_export_path,'w') ### Appends to existing file
for (chr,coord) in novel_exon_coordinates:
ji,side,coord2 = novel_exon_coordinates[(chr,coord)]
if side == 'left': start,stop = coord,coord2
if side == 'right': start,stop = coord2,coord
try: gene = ji.GeneID()
except Exception: gene = 'NA'
if gene == None: gene = 'NA'
if gene == None: gene = 'NA'
if gene != 'NA': ### Including these has no benefit for AltAnalyze (just slows down alignment and piles up memory)
if ji.Strand() == '-': stop,start=start,stop
if chr_status == False:
chr = string.replace(chr,'chr','') ### This will thus match up to the BAM files
a = [start,stop]; a.sort(); start,stop = a
bed_values = [chr,str(start),str(stop),gene,'0',str(ji.Strand())]
bed_values = cleanUpLine(string.join(bed_values,'\t'))+'\n'
bed_data.write(bed_values)
bed_data.close()
return bed_export_path
def moveBAMtoBEDFile(species,dataset_name,root_dir):
bed_export_path = filepath('AltDatabase/'+species+'/RNASeq/'+species + '_Ensembl_exons.bed')
dataset_name = string.replace(dataset_name,'exp.','')
new_fn = root_dir+'/BAMtoBED/'+species + '_'+dataset_name+'_exons.bed'
new_fn = string.replace(new_fn,'.txt','')
print 'Writing exon-level coordinates to BED file:'
print new_fn
catFiles(bed_export_path,'chr') ### concatenate the files ot the main AltDatabase directory then move
export.customFileMove(bed_export_path,new_fn)
return new_fn
def reformatExonFile(species,type,chr_status):
if type == 'exon':
filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt'
export_path = 'AltDatabase/'+species+'/RNASeq/'+species + '_Ensembl_exons.txt'
### Used by BEDTools to get counts per specific AltAnalyze exon region (should augment with de novo regions identified from junction analyses)
bed_export_path = 'AltDatabase/'+species+'/RNASeq/chr/'+species + '_Ensembl_exons.bed'
bed_data = export.ExportFile(bed_export_path)
else:
filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_junction.txt'
export_path = 'AltDatabase/'+species+'/RNASeq/'+species + '_Ensembl_junctions.txt'
print 'Writing',export_path
try:
export_data = export.ExportFile(export_path)
fn=filepath(filename); x=0
for line in open(fn,'rU').xreadlines():
data = cleanUpLine(line)
t = string.split(data,'\t')
if x==0:
x+=1
export_title = ['AltAnalyzeID','exon_id','ensembl_gene_id','transcript_cluster_id','chromosome','strand','probeset_start','probeset_stop']
export_title +=['affy_class','constitutive_probeset','ens_exon_ids','ens_constitutive_status','exon_region','exon-region-start(s)','exon-region-stop(s)','splice_events','splice_junctions']
export_title = string.join(export_title,'\t')+'\n'; export_data.write(export_title)
else:
try: gene, exonid, chr, strand, start, stop, constitutive_call, ens_exon_ids, splice_events, splice_junctions = t
except Exception: print t;kill
if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention,
if constitutive_call == 'yes': ens_constitutive_status = '1'
else: ens_constitutive_status = '0'
export_values = [gene+':'+exonid, exonid, gene, '', chr, strand, start, stop, 'known', constitutive_call, ens_exon_ids, ens_constitutive_status]
export_values+= [exonid, start, stop, splice_events, splice_junctions]
export_values = string.join(export_values,'\t')+'\n'; export_data.write(export_values)
if type == 'exon':
if chr_status == False:
chr = string.replace(chr,'chr','') ### This will thus match up to the BAM files
bed_values = [chr,start,stop,gene+':'+exonid+'_'+ens_exon_ids,'0',strand]
bed_values = string.join(bed_values,'\t')+'\n'; bed_data.write(bed_values)
export_data.close()
except: pass ### occurs for machines with write permission errors to the AltAnalyze directory (fixed in 2.1.4)
try:
if type == 'exon': bed_data.close()
except: pass
def importExonAnnotations(species,type,search_chr):
if 'exon' in type:
filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt'
else:
filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_junction.txt'
fn=filepath(filename); x=0; exon_annotation_db={}
for line in open(fn,'rU').xreadlines():
data = cleanUpLine(line)
t = string.split(data,'\t')
if x==0: x=1
else:
gene, exonid, chr, strand, start, stop, constitutive_call, ens_exon_ids, splice_events, splice_junctions = t; proceed = 'yes'
if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
if len(search_chr)>0:
if chr != search_chr: proceed = 'no'
if proceed == 'yes':
if type == 'exon': start = int(start); stop = int(stop)
ea = EnsemblImport.ExonAnnotationsSimple(chr, strand, start, stop, gene, ens_exon_ids, constitutive_call, exonid, splice_events, splice_junctions)
if type == 'junction_coordinates':
exon1_start,exon1_stop = string.split(start,'|')
exon2_start,exon2_stop = string.split(stop,'|')
if strand == '-':
exon1_stop,exon1_start = exon1_start,exon1_stop
exon2_stop,exon2_start = exon2_start,exon2_stop
#if gene == 'ENSMUSG00000027340': print chr,int(exon1_stop),int(exon2_start)
exon_annotation_db[chr,int(exon1_stop),int(exon2_start)]=ea
elif type == 'distal-exon':
exon_annotation_db[gene] = exonid
else:
try: exon_annotation_db[gene].append(ea)
except KeyError: exon_annotation_db[gene]=[ea]
return exon_annotation_db
def exportKnownJunctionComparisons(species):
gene_junction_db = JunctionArrayEnsemblRules.importEnsemblUCSCAltJunctions(species,'standard')
gene_intronjunction_db = JunctionArrayEnsemblRules.importEnsemblUCSCAltJunctions(species,'_intronic')
for i in gene_intronjunction_db: gene_junction_db[i]=[]
gene_junction_db2={}
for (gene,critical_exon,incl_junction,excl_junction) in gene_junction_db:
critical_exons = string.split(critical_exon,'|')
for critical_exon in critical_exons:
try: gene_junction_db2[gene,incl_junction,excl_junction].append(critical_exon)
except Exception: gene_junction_db2[gene,incl_junction,excl_junction] = [critical_exon]
gene_junction_db = gene_junction_db2; gene_junction_db2=[]
junction_export = 'AltDatabase/' + species + '/RNASeq/'+ species + '_junction_comps.txt'
fn=filepath(junction_export); data = open(fn,'w')
print "Exporting",junction_export
title = 'gene'+'\t'+'critical_exon'+'\t'+'exclusion_junction_region'+'\t'+'inclusion_junction_region'+'\t'+'exclusion_probeset'+'\t'+'inclusion_probeset'+'\t'+'data_source'+'\n'
data.write(title); temp_list=[]
for (gene,incl_junction,excl_junction) in gene_junction_db:
critical_exons = unique.unique(gene_junction_db[(gene,incl_junction,excl_junction)])
critical_exon = string.join(critical_exons,'|')
temp_list.append(string.join([gene,critical_exon,excl_junction,incl_junction,gene+':'+excl_junction,gene+':'+incl_junction,'AltAnalyze'],'\t')+'\n')
temp_list = unique.unique(temp_list)
for i in temp_list: data.write(i)
data.close()
def getExonAndJunctionSequences(species):
export_exon_filename = 'AltDatabase/'+species+'/RNASeq/'+species+'_Ensembl_exons.txt'
ensembl_exon_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction(export_exon_filename,'null',{})
### Import just the probeset region for mRNA alignment analysis
analysis_type = ('region_only','get_sequence'); array_type = 'RNASeq'
dir = 'AltDatabase/'+species+'/SequenceData/chr/'+species; gene_seq_filename = dir+'_gene-seq-2000_flank.fa'
ensembl_exon_db = EnsemblImport.import_sequence_data(gene_seq_filename,ensembl_exon_db,species,analysis_type)
critical_exon_file = 'AltDatabase/'+species+'/'+ array_type + '/' + array_type+'_critical-exon-seq.txt'
getCriticalJunctionSequences(critical_exon_file,species,ensembl_exon_db)
"""
### Import the full Ensembl exon sequence (not just the probeset region) for miRNA binding site analysis
analysis_type = 'get_sequence'; array_type = 'RNASeq'
dir = 'AltDatabase/'+species+'/SequenceData/chr/'+species; gene_seq_filename = dir+'_gene-seq-2000_flank.fa'
ensembl_exon_db = EnsemblImport.import_sequence_data(gene_seq_filename,ensembl_exon_db,species,analysis_type)
"""
critical_exon_file = 'AltDatabase/'+species+'/'+ array_type + '/' + array_type+'_critical-exon-seq.txt'
updateCriticalExonSequences(critical_exon_file, ensembl_exon_db)
def updateCriticalExonSequences(filename,ensembl_exon_db):
exon_seq_db_filename = filename[:-4]+'_updated.txt'
exonseq_data = export.ExportFile(exon_seq_db_filename)
critical_exon_seq_db={}; null_count={}
for gene in ensembl_exon_db:
gene_exon_data={}
for probe_data in ensembl_exon_db[gene]:
exon_id,((probe_start,probe_stop,probeset_id,exon_class,transcript_clust),ed) = probe_data
try: gene_exon_data[probeset_id] = ed.ExonSeq()
except Exception: null_count[gene]=[] ### Occurs for non-chromosomal DNA (could also download this sequence though)
if len(gene_exon_data)>0: critical_exon_seq_db[gene] = gene_exon_data
print len(null_count),'genes not assigned sequenced (e.g.,non-chromosomal)'
ensembl_exon_db=[]
### Export exon sequences
for gene in critical_exon_seq_db:
gene_exon_data = critical_exon_seq_db[gene]
for probeset in gene_exon_data:
critical_exon_seq = gene_exon_data[probeset]
values = [probeset,'',critical_exon_seq]
values = string.join(values,'\t')+'\n'
exonseq_data.write(values)
exonseq_data.close()
print exon_seq_db_filename, 'exported....'
def getCriticalJunctionSequences(filename,species,ensembl_exon_db):
### Assemble and export junction sequences
junction_seq_db_filename = string.replace(filename,'exon-seq','junction-seq')
junctionseq_data = export.ExportFile(junction_seq_db_filename)
critical_exon_seq_db={}; null_count={}
for gene in ensembl_exon_db:
gene_exon_data={}
for probe_data in ensembl_exon_db[gene]:
exon_id,((probe_start,probe_stop,probeset_id,exon_class,transcript_clust),ed) = probe_data
try: gene_exon_data[probeset_id] = ed.ExonSeq()
except Exception: null_count[gene]=[] ### Occurs for non-chromosomal DNA (could also download this sequence though)
if len(gene_exon_data)>0: critical_exon_seq_db[gene] = gene_exon_data
print len(null_count),'genes not assigned sequenced (e.g.,non-chromosomal)'
ensembl_exon_db=[]
junction_annotation_db = importExonAnnotations(species,'junction',[])
for gene in junction_annotation_db:
if gene in critical_exon_seq_db:
gene_exon_data = critical_exon_seq_db[gene]
for jd in junction_annotation_db[gene]:
exon1,exon2=string.split(jd.ExonRegionIDs(),'-')
p1=gene+':'+exon1
p2=gene+':'+exon2
p1_seq=gene_exon_data[p1][-15:]
p2_seq=gene_exon_data[p2][:15]
junction_seq = p1_seq+'|'+p2_seq
junctionseq_data.write(gene+':'+jd.ExonRegionIDs()+'\t'+junction_seq+'\t\n')
junctionseq_data.close()
print junction_seq_db_filename, 'exported....'
def getEnsemblAssociations(species,data_type,test_status,force):
### Get UCSC associations (download databases if necessary)
from build_scripts import UCSCImport
mRNA_Type = 'mrna'; run_from_scratch = 'yes'
export_all_associations = 'no' ### YES only for protein prediction analysis
update.buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)
null = EnsemblImport.getEnsemblAssociations(species,data_type,test_status); null=[]
reformatExonFile(species,'exon',True); reformatExonFile(species,'junction',True)
exportKnownJunctionComparisons(species)
getExonAndJunctionSequences(species)
######### Below code deals with user read alignment as opposed to building the AltDatabase #########
class ExonInfo:
def __init__(self,start,unique_id,annotation):
self.start = start; self.unique_id = unique_id; self.annotation = annotation
def ReadStart(self): return self.start
def UniqueID(self): return self.unique_id
def Annotation(self): return self.annotation
def setExonRegionData(self,rd): self.rd = rd
def ExonRegionData(self): return self.rd
def setExonRegionID(self,region_id): self.region_id = region_id
def ExonRegionID(self): return self.region_id
def setAlignmentRegion(self,region_type): self.region_type = region_type
def AlignmentRegion(self): return self.region_type
def __repr__(self): return "ExonData values"
class JunctionData:
def __init__(self,chr,strand,exon1_stop,exon2_start,junction_id,biotype):
self.chr = chr; self.strand = strand; self._chr = chr
self.exon1_stop = exon1_stop; self.exon2_start = exon2_start
self.junction_id = junction_id; self.biotype = biotype
#self.reads = reads; self.condition = condition
self.left_exon = None; self.right_exon = None; self.jd = None; self.gene_id = None
self.trans_splicing = None
self.splice_events=''
self.splice_junctions=''
self.seq_length=''
self.uid = None
def Chr(self): return self.chr
def Strand(self): return self.strand
def Exon1Stop(self): return self.exon1_stop
def Exon2Start(self): return self.exon2_start
def setExon1Stop(self,exon1_stop): self.exon1_stop = exon1_stop
def setExon2Start(self,exon2_start): self.exon2_start = exon2_start
def setSeqLength(self,seq_length): self.seq_length = seq_length
def SeqLength(self): return self.seq_length
def BioType(self): return self.biotype
def checkExonPosition(self,exon_pos):
if exon_pos == self.Exon1Stop(): return 'left'
else: return 'right'
### These are used to report novel exon boundaries
def setExon1Start(self,exon1_start): self.exon1_start = exon1_start
def setExon2Stop(self,exon2_stop): self.exon2_stop = exon2_stop
def Exon1Start(self): return self.exon1_start
def Exon2Stop(self): return self.exon2_stop
def Reads(self): return self.reads
def JunctionID(self): return self.junction_id
def Condition(self): return self.condition
def setExonAnnotations(self,jd):
self.jd = jd
self.splice_events = jd.AssociatedSplicingEvent()
self.splice_junctions = jd.AssociatedSplicingJunctions()
self.exon_region = jd.ExonRegionIDs()
self.exonid = jd.ExonID()
self.gene_id = jd.GeneID()
self.uid = jd.GeneID()+':'+jd.ExonRegionIDs()
def ExonAnnotations(self): return self.jd
def setLeftExonAnnotations(self,ld): self.gene_id,self.left_exon = ld
def LeftExonAnnotations(self): return self.left_exon
def setRightExonAnnotations(self,rd): self.secondary_geneid,self.right_exon = rd
def RightExonAnnotations(self): return self.right_exon
def setGeneID(self,geneid): self.gene_id = geneid
def GeneID(self): return self.gene_id
def setSecondaryGeneID(self,secondary_geneid): self.secondary_geneid = secondary_geneid
def SecondaryGeneID(self): return self.secondary_geneid
def setTransSplicing(self): self.trans_splicing = 'yes'
def TransSplicing(self): return self.trans_splicing
def SpliceSitesFound(self):
if self.jd != None: sites_found = 'both'
elif self.left_exon != None and self.right_exon != None: sites_found = 'both'
elif self.left_exon != None: sites_found = 'left'
elif self.right_exon != None: sites_found = 'right'
else: sites_found = None
return sites_found
def setConstitutive(self,constitutive): self.constitutive = constitutive
def Constitutive(self): return self.constitutive
def setAssociatedSplicingEvent(self,splice_events): self.splice_events = splice_events
def AssociatedSplicingEvent(self): return self.splice_events
def setAssociatedSplicingJunctions(self,splice_junctions): self.splice_junctions = splice_junctions
def AssociatedSplicingJunctions(self): return self.splice_junctions
def setExonID(self,exonid): self.exonid = exonid
def ExonID(self): return self.exonid
def setExonRegionID(self,exon_region): self.exon_region = exon_region
def ExonRegionID(self): return self.exon_region
def setUniqueID(self,uid): self.uid = uid
def UniqueID(self): return self.uid
def setLeftExonRegionData(self,li): self.li = li
def LeftExonRegionData(self): return self.li
def setRightExonRegionData(self,ri): self.ri = ri
def RightExonRegionData(self): return self.ri
def setNovel(self, side): self.side = side
def Novel(self): return self.side
def __repr__(self): return "JunctionData values"
def checkBEDFileFormat(bed_dir,root_dir):
""" This method checks to see if the BED files (junction or exon) have 'chr' proceeding the chr number.
It also checks to see if some files have two underscores and one has none or if double underscores are missing from all."""
dir_list = read_directory(bed_dir)
x=0
break_now = False
chr_present = False
condition_db={}
for filename in dir_list:
fn=filepath(bed_dir+filename)
#if ('.bed' in fn or '.BED' in fn): delim = 'r'
delim = 'rU'
if '.tab' in string.lower(filename) or '.bed' in string.lower(filename) or '.junction_quantification.txt' in string.lower(filename):
condition_db[filename]=[]
for line in open(fn,delim).xreadlines(): ### changed rU to r to remove \r effectively, rather than read as end-lines
if line[0] == '#': x=0 ### BioScope
elif x == 0: x=1 ###skip the first line
elif x < 10: ### Only check the first 10 lines
if 'chr' in line: ### Need to look at multiple input formats (chr could be in t[0] or t[1])
chr_present = True
x+=1
else:
break_now = True
break
if break_now == True:
break
### Check to see if exon.bed and junction.bed file names are propper or faulty (which will result in downstream errors)
double_underscores=[]
no_doubles=[]
for condition in condition_db:
if '__' in condition:
double_underscores.append(condition)
else:
no_doubles.append(condition)
exon_beds=[]
junctions_beds=[]
if len(double_underscores)>0 and len(no_doubles)>0:
### Hence, a problem is likely due to inconsistent naming
print 'The input files appear to have inconsistent naming. If both exon and junction sample data are present, make sure they are named propperly.'
print 'For example: cancer1__exon.bed, cancer1__junction.bed (double underscore required to match these samples up)!'
print 'Exiting AltAnalyze'; forceError
elif len(no_doubles)>0:
for condition in no_doubles:
condition = string.lower(condition)
if 'exon' in condition:
exon_beds.append(condition)
if 'junction' in condition:
junctions_beds.append(condition)
if len(exon_beds)>0 and len(junctions_beds)>0:
print 'The input files appear to have inconsistent naming. If both exon and junction sample data are present, make sure they are named propperly.'
print 'For example: cancer1__exon.bed, cancer1__junction.bed (double underscore required to match these samples up)!'
print 'Exiting AltAnalyze'; forceError
return chr_present
def getStrandMappingData(species):
splicesite_db={}
refExonCoordinateFile = unique.filepath('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt')
firstLine=True
for line in open(refExonCoordinateFile,'rU').xreadlines():
if firstLine: firstLine=False
else:
line = line.rstrip('\n')
t = string.split(line,'\t'); #'gene', 'exon-id', 'chromosome', 'strand', 'exon-region-start(s)', 'exon-region-stop(s)', 'constitutive_call', 'ens_exon_ids', 'splice_events', 'splice_junctions'
geneID, exon, chr, strand, start, stop = t[:6]
splicesite_db[chr,int(start)]=strand
splicesite_db[chr,int(stop)]=strand
return splicesite_db
def importBEDFile(bed_dir,root_dir,species,normalize_feature_exp,getReads=False,searchChr=None,getBiotype=None,testImport=False,filteredJunctions=None):
dir_list = read_directory(bed_dir)
begin_time = time.time()
if 'chr' not in searchChr:
searchChr = 'chr'+searchChr
condition_count_db={}; neg_count=0; pos_count=0; junction_db={}; biotypes={}; algorithms={}; exon_len_db={}; splicesite_db={}
if testImport == 'yes': print "Reading user RNA-seq input data files"
for filename in dir_list:
count_db={}; rows=0
fn=filepath(bed_dir+filename)
condition = export.findFilename(fn)
if '__' in condition:
### Allow multiple junction files per sample to be combined (e.g. canonical and non-canonical junction alignments)
condition=string.split(condition,'__')[0]+filename[-4:]
if ('.bed' in fn or '.BED' in fn or '.tab' in fn or '.TAB' in fn or '.junction_quantification.txt' in fn) and '._' not in condition:
if ('.bed' in fn or '.BED' in fn): delim = 'r'
else: delim = 'rU'
### The below code removes .txt if still in the filename along with .tab or .bed
if '.tab' in fn: condition = string.replace(condition,'.txt','.tab')
elif '.bed' in fn: condition = string.replace(condition,'.txt','.bed')
if '.TAB' in fn: condition = string.replace(condition,'.txt','.TAB')
elif '.BED' in fn: condition = string.replace(condition,'.txt','.BED')
if testImport == 'yes': print "Reading the bed file", [fn], condition
### If the BED was manually created on a Mac, will neeed 'rU' - test this
for line in open(fn,delim).xreadlines(): break
try:
if len(line)>500: delim = 'rU'
except:
pass
for line in open(fn,delim).xreadlines(): ### changed rU to r to remove \r effectively, rather than read as end-lines
data = cleanUpLine(line)
t = string.split(data,'\t')
rows+=1
if rows==1 or '#' == data[0]:
format_description = data
algorithm = 'Unknown'
if 'TopHat' in format_description: algorithm = 'TopHat'
elif 'HMMSplicer' in format_description: algorithm = 'HMMSplicer'
elif 'SpliceMap junctions' in format_description: algorithm = 'SpliceMap'
elif t[0] == 'E1': algorithm = 'BioScope-junction'
elif '# filterOrphanedMates=' in data or 'alignmentFilteringMode=' in data or '#number_of_mapped_reads=' in data:
algorithm = 'BioScope-exon'
elif '.junction_quantification.txt' in fn:
algorithm = 'TCGA format'
if 'barcode' in t: junction_position = 1
else: junction_position = 0
if 'Hybridization' in line:
rows = 0
elif '.tab' in fn and len(t)==9:
try: start = float(t[1]) ### expect this to be a numerical coordinate
except Exception: continue
algorithm = 'STAR'
strand = '-' ### If no strand exists
rows=2 ### allows this first row to be processed
if len(splicesite_db)==0: ### get strand to pos info
splicesite_db = getStrandMappingData(species)
if testImport == 'yes': print condition, algorithm
if rows>1:
try:
if ':' in t[0]:
chr = string.split(t[0],':')[0]
else: chr = t[0]
if 'chr' not in chr:
chr = 'chr'+chr
if searchChr == chr or ('BioScope' in algorithm and searchChr == t[1]): proceed = True
elif searchChr == 'chrMT' and ('BioScope' not in algorithm):
if 'M' in chr and len(chr)<6: proceed = True ### If you don't have the length, any random thing with an M will get included
else: proceed = False
else: proceed = False
except IndexError:
print 'The input file:\n',filename
print 'is not formated as expected (format='+algorithm+').'
print 'search chromosome:',searchChr
print t; force_bad_exit
if proceed:
proceed = False
if '.tab' in fn or '.TAB' in fn:
### Applies to non-BED format Junction and Exon inputs (BioScope)
if 'BioScope' in algorithm:
if algorithm == 'BioScope-exon': ### Not BED format
chr,source,data_type,start,end,reads,strand,null,gene_info=t[:9]
if 'chr' not in chr: chr = 'chr'+chr
if data_type == 'exon': ### Can also be CDS
gene_info,test,rpkm_info,null = string.split(gene_info,';')
symbol = string.split(gene_info,' ')[-1]
#refseq = string.split(transcript_info,' ')[-1]
rpkm = string.split(rpkm_info,' ')[-1]
#if normalize_feature_exp == 'RPKM': reads = rpkm ### The RPKM should be adjusted +1 counts, so don't use this
biotype = 'exon'; biotypes[biotype]=[]
exon1_stop,exon2_start = int(start),int(end); junction_id=''
### Adjust exon positions - not ideal but necessary. Needed as a result of exon regions overlapping by 1nt (due to build process)
exon1_stop+=1; exon2_start-=1
#if float(reads)>4 or getReads:
proceed = True ### Added in version 2.0.9 to remove rare novel isoforms
seq_length = abs(exon1_stop-exon2_start)
if algorithm == 'BioScope-junction':
chr = t[1]; strand = t[2]; exon1_stop = int(t[4]); exon2_start = int(t[8]); count_paired = t[17]; count_single = t[19]; score=t[21]
if 'chr' not in chr: chr = 'chr'+chr
try: exon1_start = int(t[3]); exon2_stop = int(t[9])
except Exception: pass ### If missing, these are not assigned
reads = str(int(float(count_paired))+int(float(count_single))) ### Users will either have paired or single read (this uses either)
biotype = 'junction'; biotypes[biotype]=[]; junction_id=''
if float(reads)>4 or getReads: proceed = True ### Added in version 2.0.9 to remove rare novel isoforms
seq_length = abs(float(exon1_stop-exon2_start))
if 'STAR' in algorithm:
chr = t[0]; exon1_stop = int(t[1])-1; exon2_start = int(t[2])+1; strand=''
if 'chr' not in chr: chr = 'chr'+chr
reads = str(int(t[7])+int(t[6]))
biotype = 'junction'; biotypes[biotype]=[]; junction_id=''
if float(reads)>4 or getReads: proceed = True ### Added in version 2.0.9 to remove rare novel isoforms
if (chr,exon1_stop) in splicesite_db:
strand = splicesite_db[chr,exon1_stop]
elif (chr,exon2_start) in splicesite_db:
strand = splicesite_db[chr,exon2_start]
#else: proceed = False
seq_length = abs(float(exon1_stop-exon2_start))
if strand == '-': ### switch the orientation of the positions
exon1_stop,exon2_start=exon2_start,exon1_stop
exon1_start = exon1_stop; exon2_stop = exon2_start
#if 9996685==exon1_stop and 10002682==exon2_stop:
#print chr, strand, reads, exon1_stop, exon2_start,proceed;sys.exit()
else:
try:
if algorithm == 'TCGA format':
coordinates = string.split(t[junction_position],',')
try: chr,pos1,strand = string.split(coordinates[0],':')
except Exception: print t;sys.exit()
chr,pos2,strand = string.split(coordinates[1],':')
if 'chr' not in chr: chr = 'chr'+chr
if abs(int(pos1)-int(pos2))<2:
pos2 = str(int(pos2)-1) ### This is the bed format conversion with exons of 0 length
exon1_start, exon2_stop = pos1, pos2
reads = t[junction_position+1]
junction_id = t[junction_position]
exon1_len=0; exon2_len=0
else:
### Applies to BED format Junction input
chr, exon1_start, exon2_stop, junction_id, reads, strand, null, null, null, null, lengths, null = t
if 'chr' not in chr: chr = 'chr'+chr
exon1_len,exon2_len=string.split(lengths,',')[:2]; exon1_len = int(exon1_len); exon2_len = int(exon2_len)
exon1_start = int(exon1_start); exon2_stop = int(exon2_stop)
biotype = 'junction'; biotypes[biotype]=[]
if strand == '-':
if (exon1_len+exon2_len)==0: ### Kallisto-Splice directly reports these coordinates
exon1_stop = exon1_start
exon2_start = exon2_stop
else:
exon1_stop = exon1_start+exon1_len; exon2_start=exon2_stop-exon2_len+1
### Exons have the opposite order
a = exon1_start,exon1_stop; b = exon2_start,exon2_stop
exon1_stop,exon1_start = b; exon2_stop,exon2_start = a
else:
if (exon1_len+exon2_len)==0: ### Kallisto-Splice directly reports these coordinates
exon1_stop = exon1_start
exon2_start= exon2_stop
else:
exon1_stop = exon1_start+exon1_len; exon2_start=exon2_stop-exon2_len+1
if float(reads)>4 or getReads: proceed = True
if algorithm == 'HMMSplicer':
if '|junc=' in junction_id: reads = string.split(junction_id,'|junc=')[-1]
else: proceed = False
if algorithm == 'SpliceMap':
if ')' in junction_id and len(junction_id)>1: reads = string.split(junction_id,')')[0][1:]
else: proceed = False
seq_length = abs(float(exon1_stop-exon2_start)) ### Junction distance
except Exception,e:
#print traceback.format_exc();sys.exit()
### Applies to BED format exon input (BEDTools export)
# bamToBed -i accepted_hits.bam -split| coverageBed -a stdin -b /home/nsalomonis/databases/Mm_Ensembl_exons.bed > day0_8B__exons.bed
try: chr, start, end, exon_id, null, strand, reads, bp_coverage, bp_total, percent_coverage = t
except Exception:
print 'The file',fn,'does not appear to be propperly formatted as input.'
print t; force_exception
if 'chr' not in chr: chr = 'chr'+chr
algorithm = 'TopHat-exon'; biotype = 'exon'; biotypes[biotype]=[]
try:
exon1_stop,exon2_start = int(start),int(end); junction_id=exon_id; seq_length = float(bp_total)
except:
print fn
print line
kill
if seq_length == 0:
seq_length = abs(float(exon1_stop-exon2_start))
### Adjust exon positions - not ideal but necessary. Needed as a result of exon regions overlapping by 1nt (due to build process)
exon1_stop+=1; exon2_start-=1
#if float(reads)>4 or getReads: ### Added in version 2.0.9 to remove rare novel isoforms
proceed = True
#else: proceed = False
if proceed:
if 'chr' not in chr:
chr = 'chr'+chr ### Add the chromosome prefix
if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
if strand == '+': pos_count+=1
else: neg_count+=1
if getReads and seq_length>0:
if getBiotype == biotype:
if biotype == 'junction':
### We filtered for junctions>4 reads before, now we include all reads for expressed junctions
if (chr,exon1_stop,exon2_start) in filteredJunctions:
count_db[chr,exon1_stop,exon2_start] = reads
try: exon_len_db[chr,exon1_stop,exon2_start] = seq_length
except Exception: exon_len_db[chr,exon1_stop,exon2_start] = []
else:
count_db[chr,exon1_stop,exon2_start] = reads
try: exon_len_db[chr,exon1_stop,exon2_start] = seq_length
except Exception: exon_len_db[chr,exon1_stop,exon2_start] = []
elif seq_length>0:
if (chr,exon1_stop,exon2_start) not in junction_db:
ji = JunctionData(chr,strand,exon1_stop,exon2_start,junction_id,biotype)
junction_db[chr,exon1_stop,exon2_start] = ji
try: ji.setSeqLength(seq_length) ### If RPKM imported or calculated
except Exception: null=[]
try: ji.setExon1Start(exon1_start);ji.setExon2Stop(exon2_stop)
except Exception: null=[]
key = chr,exon1_stop,exon2_start
algorithms[algorithm]=[]
if getReads:
if condition in condition_count_db:
### combine the data from the different files for the same sample junction alignments
count_db1 = condition_count_db[condition]
for key in count_db:
if key not in count_db1: count_db1[key] = count_db[key]
else:
combined_counts = int(count_db1[key])+int(count_db[key])
count_db1[key] = str(combined_counts)
condition_count_db[condition]=count_db1
else:
try: condition_count_db[condition] = count_db
except Exception: null=[] ### Occurs for other text files in the directory that are not used for the analysis
end_time = time.time()
if testImport == 'yes': print 'Read coordinates imported in',int(end_time-begin_time),'seconds'
if getReads:
#print len(exon_len_db), getBiotype, 'read counts present for',algorithm
return condition_count_db,exon_len_db,biotypes,algorithms
else:
if testImport == 'yes':
if 'exon' not in biotypes and 'BioScope' not in algorithm:
print len(junction_db),'junctions present in',algorithm,'format BED files.' # ('+str(pos_count),str(neg_count)+' by strand).'
elif 'exon' in biotypes and 'BioScope' not in algorithm:
print len(junction_db),'sequence identifiers present in input files.'
else: print len(junction_db),'sequence identifiers present in BioScope input files.'
return junction_db,biotypes,algorithms
def importExonCoordinates(probeCoordinateFile,search_chr,getBiotype):
probe_coordinate_db={}
junction_db={}
biotypes={}
x=0
fn=filepath(probeCoordinateFile)
for line in open(fn,'rU').xreadlines(): ### changed rU to r to remove \r effectively, rather than read as end-lines
data = cleanUpLine(line)
if x==0: x=1
else:
t = string.split(data,'\t')
probe_id = t[0]; probeset_id=t[1]; chr=t[2]; strand=t[3]; start=t[4]; end=t[5]
exon1_stop,exon2_start = int(start),int(end)
seq_length = abs(float(exon1_stop-exon2_start))
if 'chr' not in chr:
chr = 'chr'+chr ### Add the chromosome prefix
if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
if search_chr == chr or search_chr == None:
try: biotype = t[6]
except Exception:
if seq_length>25:biotype = 'junction'
else: biotype = 'exon'
if strand == '-':
exon1_stop,exon2_start = exon2_start, exon1_stop ### this is their actual 5' -> 3' orientation
if biotype == 'junction':
exon1_start,exon2_stop = exon1_stop,exon2_start
else:
exon1_stop+=1; exon2_start-=1
biotypes[biotype]=[]
if getBiotype == biotype or getBiotype == None:
ji = JunctionData(chr,strand,exon1_stop,exon2_start,probe_id,biotype)
junction_db[chr,exon1_stop,exon2_start] = ji
try: ji.setSeqLength(seq_length) ### If RPKM imported or calculated
except Exception: null=[]
try: ji.setExon1Start(exon1_start);ji.setExon2Stop(exon2_stop)
except Exception: null=[]
probe_coordinate_db[probe_id] = chr,exon1_stop,exon2_start ### Import the expression data for the correct chromosomes with these IDs
return probe_coordinate_db, junction_db, biotypes
def importExpressionMatrix(exp_dir,root_dir,species,fl,getReads,search_chr=None,getBiotype=None):
""" Non-RNA-Seq expression data (typically Affymetrix microarray) import and mapping to an external probe-coordinate database """
begin_time = time.time()
condition_count_db={}; neg_count=0; pos_count=0; algorithms={}; exon_len_db={}
probe_coordinate_db, junction_db, biotypes = importExonCoordinates(fl.ExonMapFile(),search_chr,getBiotype)
x=0
fn=filepath(exp_dir)[:-1]
condition = export.findFilename(fn)
### If the BED was manually created on a Mac, will neeed 'rU' - test this
for line in open(fn,'rU').xreadlines(): ### changed rU to r to remove \r effectively, rather than read as end-lines
data = cleanUpLine(line)
t = string.split(data,'\t')
if '#' == data[0]: None
elif x==0:
if 'block' in t:
start_index = 7
else:
start_index = 1
headers = t[start_index:]
x=1
else:
proceed = 'yes' ### restrict by chromosome with minimum line parsing (unless we want counts instead)
probe_id=t[0]
if probe_id in probe_coordinate_db:
key = probe_coordinate_db[probe_id]
if getReads == 'no':
pass
else:
expression_data = t[start_index:]
i=0
for sample in headers:
if sample in condition_count_db:
count_db = condition_count_db[sample]
count_db[key] = expression_data[i]
exon_len_db[key]=[]
else:
count_db={}
count_db[key] = expression_data[i]
condition_count_db[sample] = count_db
exon_len_db[key]=[]
i+=1
algorithms['ProbeData']=[]
end_time = time.time()
if testImport == 'yes': print 'Probe data imported in',int(end_time-begin_time),'seconds'
if getReads == 'yes':
return condition_count_db,exon_len_db,biotypes,algorithms
else:
return junction_db,biotypes,algorithms
def adjustCounts(condition_count_db,exon_len_db):
for key in exon_len_db:
try:
null=exon_len_db[key]
for condition in condition_count_db:
count_db = condition_count_db[condition]
try: read_count = float(count_db[key])+1 ###This adjustment allows us to obtain more realist folds where 0 is compared and use log2
except KeyError: read_count = 1 ###Was zero, but needs to be one for more realistic log2 fold calculations
count_db[key] = str(read_count) ### Replace original counts with adjusted counts
except Exception: null=[]
return condition_count_db
def calculateRPKM(condition_count_db,exon_len_db,biotype_to_examine):
"""Determines the total number of reads in a sample and then calculates RPMK relative to a pre-determined junction length (60).
60 was choosen, based on Illumina single-end read lengths of 35 (5 nt allowed overhand on either side of the junction)"""
### Get the total number of mapped reads
mapped_reads={}
for condition in condition_count_db:
mapped_reads[condition]=0
count_db = condition_count_db[condition]
for key in count_db:
read_count = count_db[key]
mapped_reads[condition]+=float(read_count)
### Use the average_total_reads when no counts reported such that 0 counts are comparable
average_total_reads = 0
for i in mapped_reads:
average_total_reads+=mapped_reads[i]
if testImport == 'yes':
print 'condition:',i,'total reads:',mapped_reads[i]
average_total_reads = average_total_reads/len(condition_count_db)
if testImport == 'yes':
print 'average_total_reads:',average_total_reads
k=0
c=math.pow(10.0,9.0)
for key in exon_len_db:
try:
for condition in condition_count_db:
total_mapped_reads = mapped_reads[condition]
try: read_count = float(condition_count_db[condition][key])+1 ###This adjustment allows us to obtain more realist folds where 0 is compared and use log2
except KeyError: read_count = 1 ###Was zero, but needs to be one for more realistic log2 fold calculations
if biotype_to_examine == 'junction': region_length = 60.0
else:
try: region_length = exon_len_db[key]
except Exception: continue ### This should only occur during testing (when restricting to one or few chromosomes)
if read_count == 1: ###This adjustment allows us to obtain more realist folds where 0 is compared and use log2
rpkm = c*(float(read_count)/(float(average_total_reads)*region_length))
try:
if region_length == 0:
region_length = abs(int(key[2]-key[1]))
rpkm = c*(read_count/(float(total_mapped_reads)*region_length))
except Exception:
print condition, key
print 'Error Encountered... Exon or Junction of zero length encoutered... RPKM failed... Exiting AltAnalyze.'
print 'This error may be due to inconsistent file naming. If both exon and junction sample data is present, make sure they are named propperly.'
print 'For example: cancer1__exon.bed, cancer1__junction.bed (double underscore required to match these samples up)!'
print [read_count,total_mapped_reads,region_length];k=1; forceError
condition_count_db[condition][key] = str(rpkm) ### Replace original counts with RPMK
except Exception:
if k == 1: kill
null=[]
return condition_count_db
def calculateGeneLevelStatistics(steady_state_export,species,expressed_gene_exon_db,normalize_feature_exp,array_names,fl,excludeLowExp=True,exportRPKMs=False):
global UserOptions; UserOptions = fl
exp_file = string.replace(steady_state_export,'-steady-state','')
if normalize_feature_exp == 'RPKM':
exp_dbase, all_exp_features, array_count = importRawCountData(exp_file,expressed_gene_exon_db,excludeLowExp=excludeLowExp)
steady_state_db = obtainGeneCounts(expressed_gene_exon_db,species,exp_dbase,array_count,normalize_feature_exp,excludeLowExp=excludeLowExp); exp_dbase=[]
exportGeneCounts(steady_state_export,array_names,steady_state_db)
steady_state_db = calculateGeneRPKM(steady_state_db)
if exportRPKMs:
exportGeneCounts(steady_state_export,array_names,steady_state_db,dataType='RPKMs')
else:
exp_dbase, all_exp_features, array_count = importNormalizedCountData(exp_file,expressed_gene_exon_db)
steady_state_db = obtainGeneCounts(expressed_gene_exon_db,species,exp_dbase,array_count,normalize_feature_exp); exp_dbase=[]
exportGeneCounts(steady_state_export,array_names,steady_state_db)
return steady_state_db, all_exp_features
def exportGeneCounts(steady_state_export,headers,gene_count_db,dataType='counts'):
### In addition to RPKM gene-level data, export gene level counts and lengths (should be able to calculate gene RPKMs from this file)
if dataType=='counts':
export_path = string.replace(steady_state_export,'exp.','counts.')
else:
export_path = steady_state_export
export_data = export.ExportFile(export_path)
title = string.join(['Ensembl']+headers,'\t')+'\n'
export_data.write(title)
for gene in gene_count_db:
sample_counts=[]
for count_data in gene_count_db[gene]:
try: read_count,region_length = count_data
except Exception: read_count = count_data
sample_counts.append(str(read_count))
sample_counts = string.join([gene]+sample_counts,'\t')+'\n'
export_data.write(sample_counts)
export_data.close()
def importGeneCounts(filename,import_type):