forked from NAL-i5K/GFF3toolkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
single_feature.py
executable file
·121 lines (108 loc) · 4.86 KB
/
single_feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
"""
QC functions for processing every single feature in GFF3 file.
"""
import re
import logging
# Module-level logger. Records are rendered as "<LEVEL, left-justified to 8> <message>".
logger = logging.getLogger(__name__)
if not logger.handlers:
    # Install the stream handler only when the logger has none yet, so the
    # same handler is never stacked twice on this logger.
    lh = logging.StreamHandler()
    lh.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
    logger.addHandler(lh)
# Threshold for this module's logger: INFO and above.
logger.setLevel(logging.INFO)
import gff3tool.lib.function4gff as function4gff
import gff3tool.lib.ERROR as ERROR
ERROR_INFO = ERROR.INFO
def FIX_PSEUDOGENE(gff):
    """Rewrite the sub-features of every pseudogene in place.

    Root features are the 'feature' lines that carry no ``Parent`` attribute.
    For each root whose type is ``pseudogene``:

    * ``mRNA`` / ``transcript`` children are retyped ``pseudogenic_transcript``;
    * below such a transcript, ``CDS`` lines get ``line_status = 'removed'``
      and ``exon`` lines are retyped ``pseudogenic_exon``;
    * every line returned by ``gff.collect_descendants`` for any of those
      grandchildren also gets ``line_status = 'removed'``.

    Root candidates with an empty attribute dict, and lines that raise
    ``KeyError`` while being inspected, are reported on stdout and skipped.

    Args:
        gff: object exposing ``lines`` (iterable of line dicts) and
            ``collect_descendants(line)``.

    Returns:
        None. The line dicts are modified in place.
    """
    warning_template = 'WARNING [Missing Attributes] Program failed.\n\t\t- Line {0:s}: {1:s}'

    def report_missing(entry):
        # Line numbers shown to the user are 1-based; 'line_index' is 0-based.
        print(warning_template.format(str(entry['line_index'] + 1), entry['line_raw']))

    # Pass 1: collect the parentless features that have at least one attribute.
    top_level = []
    for entry in gff.lines:
        try:
            if entry['line_type'] != 'feature' or 'Parent' in entry['attributes']:
                continue
            if len(entry['attributes']) == 0:
                report_missing(entry)
            else:
                top_level.append(entry)
        except KeyError:
            report_missing(entry)

    # Pass 2: retype / retire everything hanging below a pseudogene root.
    for root in top_level:
        if root['type'] != 'pseudogene':
            continue
        for transcript in root['children']:
            if transcript['type'] not in ('mRNA', 'transcript'):
                continue
            transcript['type'] = 'pseudogenic_transcript'
            for part in transcript['children']:
                kind = part['type']
                if kind == 'CDS':
                    part['line_status'] = 'removed'
                elif kind == 'exon':
                    part['type'] = 'pseudogenic_exon'
                # Anything nested below a transcript part is dropped as well.
                for descendant in gff.collect_descendants(part):
                    descendant['line_status'] = 'removed'
def check_pseudogene(gff, line):
    """Flag a feature that mentions "pseudogene" but is not typed as one (Esf0001).

    A feature is flagged when any attribute value contains "pseudogen"
    (any letter case; the two vowels after "ps" may be ``e`` or ``u`` in
    either order, which also catches the "psuedogene" misspelling) while
    the feature type does not contain "pseudogen".

    Note:
        1. This function should only be applied to a gff file that has
           already been processed by FIX_PSEUDOGENE.
        2. This function should only be applied to loci/transcript level
           features.

    Args:
        gff: object providing ``add_line_error(line, error_info, log_level=...)``.
        line: line dict with ``attributes``, ``type``, ``line_index`` and
            ``line_raw`` entries.

    Returns:
        A one-element list holding the error record (keys ``ID``,
        ``line_num``, ``eCode``, ``eLines``, ``eTag``, ``error_level``)
        when the feature is flagged, otherwise None.
    """
    eCode = 'Esf0001'
    result = dict()
    try:
        # The original patterns ended in "[Nn]*" / "n*", which made the final
        # letter optional; the keyword is now required in full.
        mentions_pseudogene = any(
            re.search(r"[Pp][Ss][EUeu][EUeu][Dd][Oo][Gg][Ee][Nn]", str(value))
            for value in line['attributes'].values()
        )
        if mentions_pseudogene and not re.search(r"pseudogen", line['type']):
            # Build the record in one step so a failed lookup cannot leave a
            # half-populated dict behind to be returned.
            result = {
                'ID': [line['attributes']['ID']],
                'line_num': ['Line {0:s}'.format(str(line['line_index'] + 1))],
                'eCode': eCode,
                'eLines': [line],
                'eTag': ERROR_INFO[eCode],
                'error_level': "Info",
            }
            gff.add_line_error(
                line,
                {'message': ERROR_INFO[eCode], 'error_type': 'FEATURE_TYPE', 'eCode': eCode},
                log_level=logging.INFO,
            )
    except Exception:
        # Best effort: a malformed line is logged and skipped rather than
        # aborting the whole QC run. KeyboardInterrupt/SystemExit propagate.
        logger.error('Program dies at Line {0:s}: {1:s}'.format(str(line['line_index']+1), line['line_raw']))
    if result:
        return [result]
    return None
def check_strand(gff, line):
    """Validate the strand field of a feature line (Esf0003).

    ``+`` and ``-`` pass silently. ``.`` and ``?`` are reported with a
    "legal character" tag; any other value is reported with an "illegal
    character" tag. Both kinds of report use error level "Error" and are
    also registered through ``gff.add_line_error``.

    Args:
        gff: object providing ``add_line_error(line, error_info)``.
        line: line dict with ``strand``, ``attributes``, ``line_index`` and
            ``line_raw`` entries.

    Returns:
        A one-element list holding the error record (keys ``ID``,
        ``line_num``, ``eCode``, ``eLines``, ``eTag``, ``error_level``)
        when the strand is reported, otherwise None.
    """
    eCode = 'Esf0003'
    result = dict()
    try:
        strand = line['strand']
        # Compare by value: the original used `is`, which tests object
        # identity and is not a reliable string comparison.
        if strand not in ('+', '-'):
            # NOTE: the "chacracter" spelling is kept as-is because these
            # strings are emitted in reports that may be compared verbatim.
            if strand in ('.', '?'):
                tag_template = '{0:s}: legal chacracter, "{1:s}", found at the strand field'
            else:
                tag_template = '{0:s}: illegal chacracter, "{1:s}" found at the strand field'
            # Build the record in one step so a failed lookup cannot leave a
            # half-populated dict behind to be returned.
            result = {
                'ID': [line['attributes']['ID']],
                'line_num': ['Line {0:s}'.format(str(line['line_index'] + 1))],
                'eCode': eCode,
                'eLines': [line],
                'eTag': tag_template.format(ERROR_INFO[eCode], strand),
                'error_level': "Error",
            }
            gff.add_line_error(line, {'message': ERROR_INFO[eCode], 'error_type': 'FEATURE_TYPE', 'eCode': eCode})
    except Exception:
        # Best effort: a malformed line is logged and skipped rather than
        # aborting the whole QC run. KeyboardInterrupt/SystemExit propagate.
        logger.error('Program dies at Line {0:s}: {1:s}'.format(str(line['line_index']+1), line['line_raw']))
    if result:
        return [result]
    return None
def main(gff, logger=None):
    """Run the single-feature QC checks over every feature line of ``gff``.

    The GFF object is first passed through ``function4gff.FIX_MISSING_ATTR``
    and ``FIX_PSEUDOGENE``; then ``check_pseudogene`` and ``check_strand``
    are applied, in that order, to each line whose ``line_type`` is
    ``'feature'``.

    Args:
        gff: GFF object exposing ``lines`` plus whatever the fix/check
            functions above require.
        logger: forwarded to ``function4gff.FIX_MISSING_ATTR``; not used
            otherwise in this function.

    Returns:
        A list with all error records collected from the checks, or None
        when no check reported anything.
    """
    function4gff.FIX_MISSING_ATTR(gff, logger=logger)
    FIX_PSEUDOGENE(gff)

    # Snapshot the feature lines before running the checks.
    features = [line for line in gff.lines if line['line_type'] == 'feature']

    error_set = []
    for feature in features:
        for check in (check_pseudogene, check_strand):
            found = check(gff, feature)
            # The checks return None (not an empty list) when nothing is wrong.
            if found is not None:
                error_set.extend(found)

    if error_set:
        return error_set
    return None