forked from sjteresi/TE_Density_Old
-
Notifications
You must be signed in to change notification settings - Fork 0
/
distribution.py
190 lines (161 loc) · 6.54 KB
/
distribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# By Scott Teresi
#----------------------------------------------------------
import re
from collections import deque # I am going to implement a deque so that I can efficiently add TEs to my data set.
# It won't be much use for lookup, not any better than a list, but it is good for constructing the set.
from density_algorithm import *
import csv
import time
import os
from multiprocessing import Process
#----------------------------------------------------------
# Path of the GFF annotation file that te_handler() reads.
gff_inputfile = 'camarosa_gff_data.gff'
# Module-wide collection of TE objects: te_handler() appends to it, and
# distribution() and stats() iterate over it.
transposons = deque()
# CSV written by distribution() and read back by te_distribution().
TE_Table = 'TE_Table.csv'
# CSVs written by te_distribution(): per-type and per-family counts.
Type_Table = 'Type_Table.csv'
Family_Table = 'Family_Table.csv'
#----------------------------------------------------------
def te_handler():
    """Parse the GFF annotation file into TE objects and write the TE table.

    Reads ``gff_inputfile`` line by line, builds one ``TE`` per retained row,
    appends it to the module-level ``transposons`` deque, and finally calls
    ``distribution()`` to dump the collection to CSV.

    Row handling:
      * blank lines and lines starting with ``#`` (comments / directives such
        as ``##sequence-region``) are skipped;
      * column 3 holds the classification as ``type`` or ``type/family``;
        a missing family becomes ``'Unknown'``;
      * ``Simple_repeat`` rows are dropped;
      * type and family are normalised through ``whitelist_type`` and
        ``whitelist_fam``; an LTR with an unknown family is labelled
        ``'LTRUnknown'``.

    Raises:
        IndexError: if a data row has fewer than five tab-separated columns.
        ValueError: if the start/stop columns are not integers.
    """
    # NOTE: one important caveat -- the '1' class (27 occurrences) may not be
    # captured here; check with Pat.
    number = 1
    with open(gff_inputfile, 'r') as f_in:
        for line in f_in:
            # Skip blank lines and every comment/directive line, not only the
            # '##sequence' blocks; these have no feature columns to index.
            if not line.strip() or line.startswith('#'):
                continue
            row = re.split(r'\t+', line.rstrip('\n'))

            parts = str(row[2]).split('/')
            te_type = parts[0]
            family = parts[1] if len(parts) > 1 else 'Unknown'
            if te_type == 'unknown':
                te_type = 'Unknown'
            if te_type == 'Simple_repeat':
                continue  # simple repeats are not wanted in the data set

            te_type = whitelist_type(te_type)
            family = whitelist_fam(family)
            if te_type == 'LTR' and family == 'Unknown':
                family = 'LTRUnknown'

            chromosome = str(row[0])
            start = row[3]
            stop = row[4]
            # Coordinates are inclusive, hence the +1.
            length = int(stop) - int(start) + 1

            label = 'TE_' + str(number)
            element = TE(label, chromosome, start, stop, length, te_type, family)
            transposons.append(element)
            number += 1
    distribution()
# Raw type label -> accepted type label. Labels that are not listed here are
# passed through unchanged.
_TYPE_RENAMES = {'RC?': 'DNA', 'RC': 'DNA', 'SINE?': 'Unknown'}


def whitelist_type(te_type):
    """Rename a TE type to the accepted ('whitelisted') type label.

    Args:
        te_type: raw type label taken from the annotation.

    Returns:
        The replacement label when ``te_type`` has a rename entry, otherwise
        ``te_type`` itself.
    """
    # Direct dictionary lookup instead of scanning every (key, value) pair.
    return _TYPE_RENAMES.get(te_type, te_type)
# Raw family label -> accepted family label. Labels that are not listed here
# are passed through unchanged. Keys are matched exactly (case-sensitive), so
# spelling variants such as 'MuDr' / 'MuDR' each need their own entry.
_FAMILY_RENAMES = {
    'Uknown': 'Unknown',  # presumably a misspelling present in the input data
    'unknown': 'Unknown',
    'Pao': 'Unknown',
    'Caulimovirus': 'Unknown',
    'Helitron': 'Unknown',
    'Maverick': 'Unknown',
    'TcMar-Pogo': 'Unknown',
    'Micro_like': 'Unknown',
    'MuDr': 'MULE',
    'MuDR': 'MULE',
    'MULE-MuDR': 'MULE',
    'MuLE-MuDR': 'MULE',
    'Mutator': 'MULE',
    'hAT-Tag1': 'hAT',
    'hAT-Tip100': 'hAT',
    'hAT-Charlie': 'hAT',
    'hAT-Ac': 'hAT',
    'Harbinger': 'PIF-Harbinger',
    'CR1': 'LINE',
    'L2': 'LINE',
    'L1': 'LINE',
    'Jockey': 'LINE',
}


def whitelist_fam(family):
    """Rename a TE family to the accepted ('whitelisted') family label.

    Args:
        family: raw family label taken from the annotation.

    Returns:
        The replacement label when ``family`` has a rename entry, otherwise
        ``family`` itself.
    """
    # Direct dictionary lookup instead of scanning every (key, value) pair.
    return _FAMILY_RENAMES.get(family, family)
def distribution(elements=None, out_path=None):
    """Write one CSV row per element, preceded by a header row.

    Args:
        elements: iterable of objects whose instance attributes are exactly
            the column names listed below. Defaults to the module-level
            ``transposons`` collection.
        out_path: destination CSV path. Defaults to ``TE_Table``.

    Raises:
        ValueError: (from ``csv.DictWriter``) if an element carries an
            attribute that is not one of the column names.
    """
    if elements is None:
        elements = transposons
    if out_path is None:
        out_path = TE_Table

    # The header doubles as the column order. Every attribute of the elements
    # must be listed; edit this list if the element attributes change.
    fieldnames = ['number', 'chromosome', 'start', 'stop', 'length', 'te_type', 'family']

    # newline='' lets the csv module control line endings itself; without it
    # an extra blank row appears after every record on Windows.
    with open(out_path, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()
        # vars(elem) is the instance attribute dict (elem.__dict__).
        writer.writerows(vars(elem) for elem in elements)
def _write_count_table(path, counts):
    """Write *counts* as a two-line CSV: a header of labels, then their counts."""
    with open(path, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=list(counts))
        writer.writeheader()
        writer.writerow(counts)


def te_distribution(te_table=None, type_table=None, family_table=None):
    """Tally TE types and families from the TE table and write both tallies.

    Reads the CSV produced by ``distribution()`` (type in column 6, family in
    column 7, after a header row) and writes two CSV files, each holding a
    header row of labels and a single row with the number of occurrences of
    each label.

    Args:
        te_table: input CSV path. Defaults to ``TE_Table``.
        type_table: output path for per-type counts. Defaults to
            ``Type_Table``.
        family_table: output path for per-family counts. Defaults to
            ``Family_Table``.
    """
    if te_table is None:
        te_table = TE_Table
    if type_table is None:
        type_table = Type_Table
    if family_table is None:
        family_table = Family_Table

    type_counts = {}
    family_counts = {}
    with open(te_table, 'r', newline='') as f_in:
        reader = csv.reader(f_in, delimiter=',')
        next(reader, None)  # skip the header; tolerate an empty file
        for row in reader:
            if not row:
                continue  # blank line, e.g. from a file written without newline=''
            the_type = str(row[5])
            the_family = str(row[6])
            # The first sighting counts as 1 (previously it was recorded as
            # 0, leaving every total one short).
            type_counts[the_type] = type_counts.get(the_type, 0) + 1
            family_counts[the_family] = family_counts.get(the_family, 0) + 1

    _write_count_table(type_table, type_counts)
    _write_count_table(family_table, family_counts)
class Genic_Element(object):
    """Base record for a feature located on a chromosome.

    Attributes:
        number: identifier of the element (stored as given).
        chromosome: chromosome label (stored as given).
        start: start coordinate, converted to ``int``.
        stop: stop coordinate, converted to ``int``.
    """

    def __init__(self, number, chromosome, start, stop):
        """Store the identifier and location; coordinates are coerced to int.

        Raises:
            ValueError: if ``start`` or ``stop`` cannot be converted to int.
        """
        self.number = number
        self.chromosome = chromosome
        self.start = int(start)
        self.stop = int(stop)

    def getNumber(self):
        """Return the element identifier."""
        return self.number

    def getChromosome(self):
        """Return the chromosome label."""
        return self.chromosome

    def getStart(self):
        """Return the start coordinate as an int."""
        return self.start

    def getStop(self):
        """Return the stop coordinate as an int."""
        return self.stop

    def getLength(self):
        """Return the element length.

        A ``length`` attribute set on the instance (as subclasses may do)
        takes precedence. This base class does not set one itself, so fall
        back to the inclusive span ``stop - start + 1`` instead of raising
        AttributeError.
        """
        try:
            return self.length
        except AttributeError:
            return self.stop - self.start + 1
class TE(Genic_Element):
    """A transposable element: a Genic_Element with a type, family and length."""

    def __init__(self, number, chromosome, start, stop, length, te_type, family):
        """Initialise the location via the base class, then the TE-specific fields.

        ``length`` is stored exactly as supplied by the caller.
        """
        super().__init__(number=number, chromosome=chromosome, start=start, stop=stop)
        self.te_type, self.family, self.length = te_type, family, length

    def getTe_Type(self):
        """Return the TE type label."""
        return self.te_type

    def getFamily(self):
        """Return the TE family label."""
        return self.family
def run_all():
    """Run the whole pipeline: build the TE table, then the count tables.

    te_handler() must run first because te_distribution() reads the TE table
    that te_handler() causes to be written.
    """
    te_handler()
    te_distribution()
#---------------------------------------------------------
def stats(elements=None):
    """Print summary counts of LTR and unknown elements.

    Args:
        elements: iterable of objects exposing ``getTe_Type()`` and
            ``getFamily()``. Defaults to the module-level ``transposons``.

    Prints four lines:
        LTR_Count          elements whose type is 'LTR'
        Unknown_LTR_Count  LTR elements whose family is unknown
        All_Unknown_fam    elements whose family is 'Unknown'
        All_Unknown        elements whose type is 'Unknown'
    """
    if elements is None:
        elements = transposons

    # te_handler() relabels an LTR with an unknown family as 'LTRUnknown', so
    # both spellings must be accepted or this count would always stay at 0
    # for data loaded through te_handler().
    unknown_ltr_families = ('Unknown', 'LTRUnknown')

    ltr_count = 0
    unknown_ltr_count = 0
    all_unknown_fam = 0
    all_unknown = 0
    for elem in elements:
        te_type = elem.getTe_Type()
        family = elem.getFamily()
        if te_type == 'LTR':
            ltr_count += 1
            if family in unknown_ltr_families:
                unknown_ltr_count += 1
        if family == 'Unknown':
            all_unknown_fam += 1
        if te_type == 'Unknown':
            all_unknown += 1

    print('LTR_Count ' + str(ltr_count))
    print('Unknown_LTR_Count ' + str(unknown_ltr_count))
    print('All_Unknown_fam ' + str(all_unknown_fam))
    print('All_Unknown ' + str(all_unknown))
# Run the full pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    run_all()